Statement Of Contribution:

Assignment 1: Dinesh Sundaramoorthy (dinsu875)
Assignment 2: Jin Yan (jinya425)

Assignment One

1. The resulting PDF picture is as follows.

Tree
Tree

Assignment Two

2.1

# Load the whitespace-separated SENIC table (the file has no header row) and
# label its columns: an ID column followed by X1 through X11.
my_data <- read.delim("SENIC.txt", header = FALSE, sep = "")
names(my_data) <- c("ID", paste0("X", 1:11))

2.2

# Return the positions of the outliers in a numeric vector `x`.
#
# A value counts as an outlier when it lies more than 1.5 interquartile ranges
# above the third quartile or below the first quartile. Missing values are
# skipped when the quartiles are computed and are never reported as outliers.
#
# x: numeric vector.
# Returns an integer vector of indices into `x` (empty when nothing is flagged).
f_quantile <- function(x) {
  # A single quantile() call yields both quartiles; unname() drops the
  # "25%"/"75%" labels so they cannot end up on the result.
  quartiles <- unname(quantile(x, probs = c(0.25, 0.75), na.rm = TRUE))
  whisker <- 1.5 * (quartiles[2] - quartiles[1])
  which(x > quartiles[2] + whisker | x < quartiles[1] - whisker)
}

2.3

# Collect the X3 values flagged as outliers and give each one a y value of
# zero so that it is drawn on the baseline of the density curve.
outliers <- my_data$X3[f_quantile(my_data$X3)]
data_outliers <- data.frame(x = outliers, y = numeric(length(outliers)))

# Density of X3 with the outliers overlaid as hollow diamonds (shape 5).
plot1 <- ggplot(data = my_data, mapping = aes(x = X3)) +
  geom_density() +
  geom_point(data = data_outliers, mapping = aes(x = x, y = y), shape = 5)
plot1

From the graph, we can see that most of the values cluster around 4.5. The distribution looks approximately Gaussian.

2.4

names <- colnames(my_data[, -c(1, 8, 9)]) # names of the columns to plot

# Build one density plot per selected column, stored under the column's name.
# Outlier markers are added only when the column really has outliers: a
# zero-length `x` cannot be combined with the scalar `y = 0` in data.frame(),
# so an outlier-free column gets the plain density curve instead.
# lapply() gives every plot its own `column` binding, so the lazily evaluated
# aes() mapping still refers to the right column when the plot is drawn.
plot_list <- lapply(setNames(names, names), function(column) {
  values <- my_data[[column]]
  outliers <- values[f_quantile(values)]

  # `.data[[column]]` replaces the deprecated aes_string(); labs() pins the
  # axis title to the plain column name.
  density_plot <- ggplot(my_data, aes(x = .data[[column]])) +
    stat_density() +
    labs(x = column)

  if (length(outliers) == 0) {
    return(density_plot)
  }

  data_outliers <- data.frame(x = outliers, y = 0)
  density_plot +
    geom_point(data = data_outliers, aes(x = x, y = y), shape = 5)
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Arrange the per-column density plots on a 3 x 3 grid.
combined_plots <- grid.arrange(
  grobs = plot_list,
  nrow = 3,
  ncol = 3
)

From the graph, we can see that not all features in the dataset follow a Gaussian distribution. Some of them also appear to be related, since their densities show similar patterns.

2.5

# Scatter plot of X10 against X3, with X6 mapped to the point colour.
plot2 <- ggplot(data = my_data, mapping = aes(x = X3, y = X10, colour = X6)) +
  geom_point()
print(plot2)

From the graph, we can see that as the number of beds increases, the number of nurses increases as well. A possible problem is that the colour scale spans too wide a range for this dataset; I think a more granular scale would make the differences clearer. That way we might find out why the infection risk differs so much between some points with a similar number of nurses and beds.

2.6

# Convert the static ggplot from step 2.3 into an interactive plotly widget.
plot1 %>% ggplotly()

By comparison, the plot generated by ggplotly is interactive: hovering over the curve shows the exact density value at that point.

2.7

# Histogram of X3 with its outliers overlaid as diamond markers.
outliers <- my_data$X3[f_quantile(my_data$X3)]
plot3 <- my_data %>%
  select("X3") %>%
  plot_ly(x = ~X3) %>%
  # add_fun() draws the histogram on a side branch, so the filter() below
  # narrows the data for the marker trace only.
  add_fun(function(p) {
    p %>% add_histogram()
  }) %>%
  filter(X3 %in% outliers) %>%
  # Give the markers an explicit y of zero (one per outlier). Without a y
  # value a scatter trace is positioned by point index, so the diamonds would
  # climb 0, 1, 2, ... instead of sitting on the baseline of the histogram.
  add_trace(
    y = ~ rep(0, length(X3)),
    type = "scatter",
    mode = "markers",
    marker = list(symbol = "diamond")
  )

plot3

2.8

library(shiny)
# source("helper.R")
# Read the whitespace-separated SENIC table (no header row) for the app and
# name its columns ID, X1, ..., X11 in one step.
my_data <- setNames(
  read.delim("SENIC.txt", header = FALSE, sep = ""),
  c("ID", paste0("X", 1:11))
)

# Return the indices of the values in `x` that fall outside the fences
# Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
#
# Missing values are ignored when the quartiles are computed and are never
# reported, so a column containing NA no longer stops the app with an error.
#
# x: numeric vector.
# Returns an integer vector of positions in `x` (empty if there are none).
f_quantile <- function(x) {
  # unname() strips the "25%"/"75%" labels that quantile() attaches.
  q <- unname(quantile(x, probs = c(0.25, 0.75), na.rm = TRUE))
  lower_q <- q[1]
  upper_q <- q[2]
  fence <- 1.5 * (upper_q - lower_q)
  which(x > upper_q + fence | x < lower_q - fence)
}
library(ggplot2)

# Draw the density of one column of `my_data`, marking its outliers.
#
# name: column name as a string (e.g. "X3").
# bw:   bandwidth forwarded to stat_density().
# Returns a ggplot object. Outliers, as reported by f_quantile(), are drawn as
# hollow diamonds at y = 0; a column without outliers gets the density only.
plot <- function(name, bw) {
  values <- my_data[[name]]
  outliers <- values[f_quantile(values)]

  # `.data[[name]]` replaces the deprecated aes_string(); labs() pins the axis
  # title to the plain column name.
  density_plot <- ggplot(my_data, aes(x = .data[[name]])) +
    stat_density(bw = bw) +
    labs(x = name)

  # Decide on the data rather than on a hard-coded column name: a zero-length
  # `x` cannot be combined with the scalar `y = 0` in data.frame().
  if (length(outliers) == 0) {
    return(density_plot)
  }

  data_outliers <- data.frame(x = outliers, y = 0)
  density_plot +
    geom_point(data = data_outliers, aes(x = x, y = y), shape = 5)
}

# Define UI

# Page layout: a sidebar with the variable checkboxes and the bandwidth
# slider, and a main panel holding the combined density plots.
ui <- fluidPage(
  titlePanel("shiny_for_visualization"),
  sidebarLayout(
    sidebarPanel(
      helpText("Choose different parameters to create plots"),
      # Labels and values are the same, so a plain `choices` vector is enough.
      checkboxGroupInput(
        "variables", "Choose variable:",
        choices = c("X1", "X2", "X3", "X4", "X5", "X6", "X9", "X10", "X11"),
        selected = "X1"
      ),
      # Last argument: no trailing comma, which would pass an empty argument
      # to sidebarPanel().
      sliderInput(
        "range", strong("choose bandwidth size"),
        value = 0.1, min = 0.1, max = 1
      )
    ),
    mainPanel(plotOutput("selected_plot"))
  )
)

# Server logic: render one density plot per ticked variable, stacked
# vertically in the "selected_plot" output.
server <- function(input, output) {
  output$selected_plot <- renderPlot({
    # With every box unticked input$variables is empty; req() then cancels
    # the render quietly instead of calling grid.arrange() with no plots.
    req(input$variables)
    chosen <- input$variables

    # plot() is the density helper defined earlier in this script. The list
    # gets its own name so it no longer shadows that helper.
    panels <- lapply(chosen, function(variable) plot(variable, input$range))
    names(panels) <- chosen

    # Namespace-qualified because the app only attaches shiny and ggplot2.
    gridExtra::grid.arrange(grobs = panels, nrow = length(chosen))
  })
}

# Launch the app. The arguments are passed by name; `ui <- ui` would be an
# assignment evaluated in place and matched by position only.
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents

I think bw = 1 is a good setting, since it shows the general shape of each variable's distribution, which is the main goal of drawing a density graph.